{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Engineering and Labeling\n",
    "\n",
    "We'll use the price-volume data and generate features that we can feed into a model.  We'll use this notebook for all the coding exercises of this lesson, so please open this notebook in a separate tab of your browser.  \n",
    "\n",
    "Please run the following code up to and including \"Make Factors.\"  Then continue on with the lesson."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "!{sys.executable} -m pip install --quiet -r requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.style.use('ggplot')\n",
    "plt.rcParams['figure.figsize'] = (14, 8)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Registering data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import project_helper\n",
    "from zipline.data import bundles\n",
    "\n",
    "os.environ['ZIPLINE_ROOT'] = os.path.join(os.getcwd(), '..', '..', 'data', 'project_4_eod')\n",
    "\n",
    "ingest_func = bundles.csvdir.csvdir_equities(['daily'], project_helper.EOD_BUNDLE_NAME)\n",
    "bundles.register(project_helper.EOD_BUNDLE_NAME, ingest_func)\n",
    "\n",
    "print('Data Registered')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from zipline.pipeline import Pipeline\n",
    "from zipline.pipeline.factors import AverageDollarVolume\n",
    "from zipline.utils.calendars import get_calendar\n",
    "\n",
    "\n",
    "universe = AverageDollarVolume(window_length=120).top(500) \n",
    "trading_calendar = get_calendar('NYSE') \n",
    "bundle_data = bundles.load(project_helper.EOD_BUNDLE_NAME)\n",
    "engine = project_helper.build_pipeline_engine(bundle_data, trading_calendar)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "universe_end_date = pd.Timestamp('2016-01-05', tz='UTC')\n",
    "\n",
    "universe_tickers = engine\\\n",
    "    .run_pipeline(\n",
    "        Pipeline(screen=universe),\n",
    "        universe_end_date,\n",
    "        universe_end_date)\\\n",
    "    .index.get_level_values(1)\\\n",
    "    .values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from zipline.data.data_portal import DataPortal\n",
    "\n",
    "data_portal = DataPortal(\n",
    "    bundle_data.asset_finder,\n",
    "    trading_calendar=trading_calendar,\n",
    "    first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day,\n",
    "    equity_minute_reader=None,\n",
    "    equity_daily_reader=bundle_data.equity_daily_bar_reader,\n",
    "    adjustment_reader=bundle_data.adjustment_reader)\n",
    "\n",
    "def get_pricing(data_portal, trading_calendar, assets, start_date, end_date, field='close'):\n",
    "    end_dt = pd.Timestamp(end_date.strftime('%Y-%m-%d'), tz='UTC', offset='C')\n",
    "    start_dt = pd.Timestamp(start_date.strftime('%Y-%m-%d'), tz='UTC', offset='C')\n",
    "\n",
    "    end_loc = trading_calendar.closes.index.get_loc(end_dt)\n",
    "    start_loc = trading_calendar.closes.index.get_loc(start_dt)\n",
    "\n",
    "    return data_portal.get_history_window(\n",
    "        assets=assets,\n",
    "        end_dt=end_dt,\n",
    "        bar_count=end_loc - start_loc,\n",
    "        frequency='1d',\n",
    "        field=field,\n",
    "        data_frequency='daily')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Make Factors\n",
    "\n",
    "- We'll use the same factors we have been using in the lessons about alpha factor research.  Factors can be features that we feed into the model.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from zipline.pipeline.factors import CustomFactor, DailyReturns, Returns, SimpleMovingAverage\n",
    "from zipline.pipeline.data import USEquityPricing\n",
    "\n",
    "factor_start_date = universe_end_date - pd.DateOffset(years=3, days=2)\n",
    "sector = project_helper.Sector()\n",
    "\n",
    "def momentum_1yr(window_length, universe, sector):\n",
    "    return Returns(window_length=window_length, mask=universe) \\\n",
    "        .demean(groupby=sector) \\\n",
    "        .rank() \\\n",
    "        .zscore()\n",
    "\n",
    "def mean_reversion_5day_sector_neutral(window_length, universe, sector):\n",
    "    return -Returns(window_length=window_length, mask=universe) \\\n",
    "        .demean(groupby=sector) \\\n",
    "        .rank() \\\n",
    "        .zscore()\n",
    "\n",
    "def mean_reversion_5day_sector_neutral_smoothed(window_length, universe, sector):\n",
    "    unsmoothed_factor = mean_reversion_5day_sector_neutral(window_length, universe, sector)\n",
    "    return SimpleMovingAverage(inputs=[unsmoothed_factor], window_length=window_length) \\\n",
    "        .rank() \\\n",
    "        .zscore()\n",
    "\n",
    "class CTO(Returns):\n",
    "    \"\"\"\n",
    "    Computes the overnight return, per hypothesis from\n",
    "    https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2554010\n",
    "    \"\"\"\n",
    "    inputs = [USEquityPricing.open, USEquityPricing.close]\n",
    "    \n",
    "    def compute(self, today, assets, out, opens, closes):\n",
    "        \"\"\"\n",
    "        The opens and closes matrix is 2 rows x N assets, with the most recent at the bottom.\n",
    "        As such, opens[-1] is the most recent open, and closes[0] is the earlier close\n",
    "        \"\"\"\n",
    "        out[:] = (opens[-1] - closes[0]) / closes[0]\n",
    "\n",
    "        \n",
    "class TrailingOvernightReturns(Returns):\n",
    "    \"\"\"\n",
    "    Sum of trailing 1m O/N returns\n",
    "    \"\"\"\n",
    "    window_safe = True\n",
    "    \n",
    "    def compute(self, today, asset_ids, out, cto):\n",
    "        out[:] = np.nansum(cto, axis=0)\n",
    "\n",
    "        \n",
    "def overnight_sentiment(cto_window_length, trail_overnight_returns_window_length, universe):\n",
    "    cto_out = CTO(mask=universe, window_length=cto_window_length)\n",
    "    return TrailingOvernightReturns(inputs=[cto_out], window_length=trail_overnight_returns_window_length) \\\n",
    "        .rank() \\\n",
    "        .zscore()\n",
    "\n",
    "def overnight_sentiment_smoothed(cto_window_length, trail_overnight_returns_window_length, universe):\n",
    "    unsmoothed_factor = overnight_sentiment(cto_window_length, trail_overnight_returns_window_length, universe)\n",
    "    return SimpleMovingAverage(inputs=[unsmoothed_factor], window_length=trail_overnight_returns_window_length) \\\n",
    "        .rank() \\\n",
    "        .zscore()\n",
    "\n",
    "universe = AverageDollarVolume(window_length=120).top(500)\n",
    "sector = project_helper.Sector()\n",
    "\n",
    "pipeline = Pipeline(screen=universe)\n",
    "pipeline.add(\n",
    "    momentum_1yr(252, universe, sector),\n",
    "    'Momentum_1YR')\n",
    "pipeline.add(\n",
    "    mean_reversion_5day_sector_neutral_smoothed(20, universe, sector),\n",
    "    'Mean_Reversion_Sector_Neutral_Smoothed')\n",
    "pipeline.add(\n",
    "    overnight_sentiment_smoothed(2, 10, universe),\n",
    "    'Overnight_Sentiment_Smoothed')\n",
    "\n",
    "all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)\n",
    "\n",
    "all_factors.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Stop here and continue with the lesson section titled \"Features\"."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Universal Quant Features\n",
    "\n",
    "* stock volatility: zipline has a custom factor called AnnualizedVolatility.  The [source code is here](https://github.com/quantopian/zipline/blob/master/zipline/pipeline/factors/basic.py) and also pasted below:\n",
    "\n",
    "```\n",
    "class AnnualizedVolatility(CustomFactor):\n",
    "    \"\"\"\n",
    "    Volatility. The degree of variation of a series over time as measured by\n",
    "    the standard deviation of daily returns.\n",
    "    https://en.wikipedia.org/wiki/Volatility_(finance)\n",
    "    **Default Inputs:** :data:`zipline.pipeline.factors.Returns(window_length=2)`  # noqa\n",
    "    Parameters\n",
    "    ----------\n",
    "    annualization_factor : float, optional\n",
    "        The number of time units per year. Defaults is 252, the number of NYSE\n",
    "        trading days in a normal year.\n",
    "    \"\"\"\n",
    "    inputs = [Returns(window_length=2)]\n",
    "    params = {'annualization_factor': 252.0}\n",
    "    window_length = 252\n",
    "\n",
    "    def compute(self, today, assets, out, returns, annualization_factor):\n",
    "        out[:] = nanstd(returns, axis=0) * (annualization_factor ** .5)\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from zipline.pipeline.factors import AnnualizedVolatility\n",
    "AnnualizedVolatility()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz\n",
    "We can see that the returns `window_length` is 2, because we're dealing with daily returns, which are calculated as the percent change from one day to the following day (2 days).  The `AnnualizedVolatility` `window_length` is 252 by default, because it's the one-year volatility.  Try to adjust the call to the constructor of `AnnualizedVolatility` so that this represents one-month volatility (still annualized, but calculated over a time window of 20 trading days)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO\n",
    "AnnualizedVolatility(window_length=20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz: Create one-month and six-month annualized volatility.\n",
    "Create `AnnualizedVolatility` objects for 20 day and 120 day (one month and six-month) time windows.  Remember to set the `mask` parameter to the `universe` object created earlier (this filters the stocks to match the list in the `universe`).  Convert these to ranks, and then convert the ranks to zscores."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO\n",
    "volatility_20d = AnnualizedVolatility(window_length=20, mask=universe).rank().zscore()\n",
    "volatility_120d = AnnualizedVolatility(window_length=120, mask=universe).rank().zscore()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Add to the pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline.add(volatility_20d, 'volatility_20d')\n",
    "pipeline.add(volatility_120d, 'volatility_120d')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz: Average Dollar Volume feature\n",
    "We've been using [AverageDollarVolume](http://www.zipline.io/appendix.html#zipline.pipeline.factors.AverageDollarVolume) to choose the stock universe based on stocks that have the highest dollar volume.  We can also use it as a feature that is input into a predictive model.  \n",
    "Use 20 day and 120 day `window_length` for average dollar volume.  Then rank it and convert to a zscore."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"already imported earlier, but shown here for reference\"\"\"\n",
    "#from zipline.pipeline.factors import AverageDollarVolume \n",
    "\n",
    "# TODO: 20-day and 120 day average dollar volume\n",
    "adv_20d = AverageDollarVolume(window_length=20, mask=universe).rank().zscore()\n",
    "adv_120d = AverageDollarVolume(window_length=120, mask=universe).rank().zscore()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Add average dollar volume features to pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline.add(adv_20d, 'adv_20d')\n",
    "pipeline.add(adv_120d, 'adv_120d')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Market Regime Features\n",
    "We are going to try to capture market-wide regimes:  Market-wide means we'll look at the aggregate movement of the universe of stocks.\n",
    "\n",
    "High and low dispersion: dispersion is looking at the dispersion (standard deviation) of the cross section of all stocks at each period of time (on each day).  We'll inherit from [CustomFactor](http://www.zipline.io/appendix.html?highlight=customfactor#zipline.pipeline.CustomFactor).  We'll feed in [DailyReturns](http://www.zipline.io/appendix.html?highlight=dailyreturns#zipline.pipeline.factors.DailyReturns) as the `inputs`.  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz\n",
    "If the `inputs` to our market dispersion factor are the daily returns, and we plan to calculate the market dispersion on each day, what should be the `window_length` of the market dispersion class?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Answer\n",
    "window_length = 1, because each row of the input data represents returns (not stock prices)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz: market dispersion feature\n",
    "Create a class that inherits from `CustomFactor`.  Override the `compute` function to calculate the population standard deviation of all the stocks over a specified window of time.\n",
    "\n",
    "Calculate the mean returns\n",
    "\n",
    "$\\mu = \\sum_{t=0}^{T}\\sum_{i=1}^{N}r_{i,t}$\n",
    "\n",
    "$\\sqrt{\\frac{1}{T} \\sum_{t=0}^{T}  \\frac{1}{N}\\sum_{i=1}^{N}(r_{i,t} - \\mu)^2}$\n",
    "\n",
    "Use [numpy.nanmean](https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.nanmean.html) to calculate the average market return $\\mu$ and to calculate the average of the squared differences.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class MarketDispersion(CustomFactor):\n",
    "    inputs = [DailyReturns()]\n",
    "    window_length = 1\n",
    "    window_safe = True\n",
    "\n",
    "    def compute(self, today, assets, out, returns):\n",
    "        \n",
    "        # TODO: calculate average returns\n",
    "        mean_returns = np.nanmean(returns)\n",
    "        \n",
    "        #TODO: calculate standard deviation of returns\n",
    "        out[:] = np.sqrt(np.nanmean((returns - mean_returns)**2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz\n",
    "\n",
    "Create the MarketDispersion object.  Apply two separate smoothing operations using [SimpleMovingAverage](https://www.zipline.io/appendix.html?highlight=simplemovingaverage#zipline.pipeline.factors.SimpleMovingAverage).  One with a one-month window, and another with a 6-month window.  Add both to the pipeline."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: create MarketDispersion object\n",
    "dispersion = MarketDispersion(mask=universe)\n",
    "\n",
    "# TODO: apply one-month simple moving average\n",
    "dispersion_20d = SimpleMovingAverage(inputs=[dispersion], window_length=20)\n",
    "\n",
    "# TODO: apply 6-month simple moving average\n",
    "dispersion_120d = SimpleMovingAverage(inputs=[dispersion], window_length=120)\n",
    "\n",
    "# Add to pipeline\n",
    "pipeline.add(dispersion_20d, 'dispersion_20d')\n",
    "pipeline.add(dispersion_120d, 'dispersion_120d')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Market volatility feature\n",
    "* High and low volatility  \n",
    "We'll also build a class for market volatility, which inherits from [CustomFactor](http://www.zipline.io/appendix.html?highlight=customfactor#zipline.pipeline.CustomFactor).  This will measure the standard deviation of the returns of the \"market\".  In this case, we're approximating the \"market\" as the equal weighted average return of all the stocks in the stock universe.\n",
    "\n",
    "##### Market return\n",
    "$r_{m,t} = \\frac{1}{N}\\sum_{i=1}^{N}r_{i,t}$ for each day $t$ in `window_length`.  \n",
    "\n",
    "##### Average market return\n",
    "Also calculate the average market return over the `window_length` $T$ of days:  \n",
    "$\\mu_{m} = \\frac{1}{T}\\sum_{t=1}^{T} r_{m,t}$\n",
    "\n",
    "#### Standard deviation of market return\n",
    "Then calculate the standard deviation of the market return  \n",
    "$\\sigma_{m,t} = \\sqrt{252 \\times \\frac{1}{N} \\sum_{t=1}^{T}(r_{m,t} - \\mu_{m})^2 } $ \n",
    "\n",
    "##### Hints\n",
    "* Please use [numpy.nanmean](https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.nanmean.html) so that it ignores null values.\n",
    "* When using `numpy.nanmean`:  \n",
    "axis=0 will calculate one average for every column (think of it like creating a new row in a spreadsheet)  \n",
    "axis=1 will calculate one average for every row (think of it like creating a new column in a spreadsheet)  \n",
    "* The returns data in `compute` has one day in each row, and one stock in each column.\n",
    "* Notice that we defined a dictionary `params` that has a key `annualization_factor`.  This `annualization_factor` can be used as a regular variable, and you'll be using it in the `compute` function.  This is also done in the definition of AnnualizedVolatility (as seen earlier in the notebook)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class MarketVolatility(CustomFactor):\n",
    "    inputs = [DailyReturns()]\n",
    "    window_length = 1  # We'll want to set this in the constructor when creating the object.\n",
    "    window_safe = True\n",
    "    params = {'annualization_factor': 252.0}\n",
    "    \n",
    "    def compute(self, today, assets, out, returns, annualization_factor):\n",
    "        \n",
    "        # TODO\n",
    "        \"\"\" \n",
    "        For each row (each row represents one day of returns), \n",
    "        calculate the average of the cross-section of stock returns\n",
    "        So that market_returns has one value for each day in the window_length\n",
    "        So choose the appropriate axis (please see hints above)\n",
    "        \"\"\"\n",
    "        mkt_returns = np.nanmean(returns, axis=1) \n",
    "        \n",
    "        # TODO\n",
    "        # Calculate the mean of market returns\n",
    "        mkt_returns_mu = np.nanmean(mkt_returns)\n",
    "        \n",
    "        # TODO\n",
    "        # Calculate the standard deviation of the market returns, then annualize them.\n",
    "        out[:] = np.sqrt(annualization_factor * np.nanmean((mkt_returns-mkt_returns_mu)**2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: create market volatility features using one month and six-month windows\n",
    "market_vol_20d = MarketVolatility(window_length=20, mask=universe)\n",
    "market_vol_120d = MarketVolatility(window_length=120, mask=universe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add market volatility features to pipeline\n",
    "pipeline.add(market_vol_20d, 'market_vol_20d')\n",
    "pipeline.add(market_vol_120d, 'market_vol_120d')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Stop here and continue with the lesson section \"Sector and Industry\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sector and Industry"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Add sector code\n",
    "\n",
    "Note that after we run the pipeline and get the data in a dataframe, we can work on enhancing the sector code feature with one-hot encoding."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline.add(sector, 'sector_code')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Run pipeline to calculate features\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)\n",
    "all_factors.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### One-hot encode sector\n",
    "\n",
    "Let's get all the unique sector codes.  Then we'll use the `==` comparison operator to check when the sector code equals a particular value.  This returns a series of True/False values.  For some functions that we'll use in a later lesson, it's easier to work with numbers instead of booleans.  We can convert the booleans to type int.  So False becomes 0, and 1 becomes True."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sector_code_l = set(all_factors['sector_code'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sector_0 = all_factors['sector_code'] == 0\n",
    "sector_0[0:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sector_0_numeric = sector_0.astype(int)\n",
    "sector_0_numeric[0:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz: One-hot encode sector\n",
    "Choose column names that look like \"sector_code_0\", \"sector_code_1\" etc.  Store the values as 1 when the row matches the sector code of the column, 0 otherwise."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: one-hot encode sector and store into dataframe\n",
    "for s in sector_code_l:\n",
    "    all_factors[f'sector_code_{s}'] = (all_factors['sector_code'] == s).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_factors.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Stop here and continue with the lesson section \"Date Parts\"."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Date Parts\n",
    "* We will make features that might capture trader/investor behavior due to calendar anomalies.\n",
    "* We can get the dates from the index of the dataframe that is returned from running the pipeline."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Accessing index of dates\n",
    "* Note that we can access the date index. using `Dataframe.index.get_level_values(0)`, since the date is stored as index level 0, and the asset name is stored in index level 1.  This is of type [DateTimeIndex](https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DatetimeIndex.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_factors.index.get_level_values(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### [DateTimeIndex attributes](https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DatetimeIndex.html)\n",
    "\n",
    "* The `month` attribute is a numpy array with a 1 for January, 2 for February ... 12 for December etc.  \n",
    "* We can use a comparison operator such as `==` to return True or False.\n",
    "\n",
    "* It's usually easier to have all data of a similar type (numeric), so we recommend converting booleans to integers.  \n",
    "The numpy ndarray has a function `.astype()` that can cast the data to a specified type.  \n",
    "For instance, `astype(int)` converts False to 0 and True to 1.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example\n",
    "print(all_factors.index.get_level_values(0).month)\n",
    "print(all_factors.index.get_level_values(0).month == 1)\n",
    "print( (all_factors.index.get_level_values(0).month == 1).astype(int) )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Quiz\n",
    "* Create a numpy array that has 1 when the month is January, and 0 otherwise.  Store it as a column in the all_factors dataframe.\n",
    "* Add another similar column to indicate when the month is December"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: create a feature that indicate whether it's January\n",
    "all_factors['is_January'] = (all_factors.index.get_level_values(0).month == 1).astype(int)\n",
    "\n",
    "# TODO: create a feature to indicate whether it's December\n",
    "all_factors['is_December'] = (all_factors.index.get_level_values(0).month == 12).astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Weekday, quarter\n",
    "* add columns to the all_factors dataframe that specify the weekday, quarter and year.\n",
    "* As you can see in the [documentation for DateTimeIndex](https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DatetimeIndex.html), `weekday`, `quarter`, and `year` are attributes that you can use here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we can see that 0 is for Monday, 4 is for Friday\n",
    "set(all_factors.index.get_level_values(0).weekday)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Q1, Q2, Q3 and Q4 are represented by integers too\n",
    "set(all_factors.index.get_level_values(0).quarter)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz\n",
    "Add features for weekday, quarter and year."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO\n",
    "all_factors['weekday'] = all_factors.index.get_level_values(0).weekday\n",
    "all_factors['quarter'] = all_factors.index.get_level_values(0).quarter\n",
    "all_factors['year'] = all_factors.index.get_level_values(0).year"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start and end-of features\n",
    "\n",
    "* The start and end of the week, month, and quarter may have structural differences in trading activity.\n",
    "* [Pandas.date_range](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.date_range.html) takes the start_date, end_date, and frequency.\n",
    "* The [frequency](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases) for end of month is `BM`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example\n",
    "tmp = pd.date_range(start=factor_start_date, end=universe_end_date, freq='BM')\n",
    "tmp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Example\n",
    "\n",
    "Create a DatetimeIndex that stores the dates which are the last business day of each month.  \n",
    "Use the `.isin` function, passing in these last days of the month, to create a series of booleans.  \n",
    "Convert the booleans to integers.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "last_day_of_month = pd.date_range(start=factor_start_date, end=universe_end_date, freq='BM')\n",
    "last_day_of_month"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp_month_end = all_factors.index.get_level_values(0).isin(last_day_of_month)\n",
    "tmp_month_end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp_month_end_int = tmp_month_end.astype(int)\n",
    "tmp_month_end_int"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_factors['month_end'] = tmp_month_end_int"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz: Start of Month\n",
    "Create a feature that indicates the first business day of each month.\n",
    "\n",
    "**Hint:** The frequency for first business day of the month uses the code `BMS`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: month_start feature\n",
    "first_day_of_month = pd.date_range(start=factor_start_date, end=universe_end_date, freq='BMS')\n",
    "all_factors['month_start'] = (all_factors.index.get_level_values(0).isin(first_day_of_month)).astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz: Quarter end and quarter start\n",
    "\n",
    "Create features for the last business day of each quarter, and first business day of each quarter.  \n",
    "**Hint**: use `freq=BQ` for business day end of quarter, and `freq=BQS` for business day start of quarter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: qtr_end feature\n",
    "last_day_qtr = pd.date_range(start=factor_start_date, end=universe_end_date, freq='BQ')\n",
    "all_factors['qtr_end'] = (all_factors.index.get_level_values(0).isin(last_day_qtr)).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: qtr_start feature\n",
    "first_day_qtr = pd.date_range(start=factor_start_date, end=universe_end_date, freq='BQS')\n",
    "all_factors['qtr_start'] = (all_factors.index.get_level_values(0).isin(first_day_qtr)).astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## View all features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(all_factors.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that we can skip the sector_code feature, since we one-hot encoded it into separate features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = ['Mean_Reversion_Sector_Neutral_Smoothed',\n",
    " 'Momentum_1YR',\n",
    " 'Overnight_Sentiment_Smoothed',\n",
    " 'adv_120d',\n",
    " 'adv_20d',\n",
    " 'dispersion_120d',\n",
    " 'dispersion_20d',\n",
    " 'market_vol_120d',\n",
    " 'market_vol_20d',\n",
    " #'sector_code', # removed sector_code\n",
    " 'volatility_120d',\n",
    " 'volatility_20d',\n",
    " 'sector_code_0',\n",
    " 'sector_code_1',\n",
    " 'sector_code_2',\n",
    " 'sector_code_3',\n",
    " 'sector_code_4',\n",
    " 'sector_code_5',\n",
    " 'sector_code_6',\n",
    " 'sector_code_7',\n",
    " 'sector_code_8',\n",
    " 'sector_code_9',\n",
    " 'sector_code_10',\n",
    " 'sector_code_-1',\n",
    " 'is_January',\n",
    " 'is_December',\n",
    " 'weekday',\n",
    " 'quarter',\n",
    " 'year',\n",
    " 'month_start',\n",
    " 'qtr_end',\n",
    " 'qtr_start']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Stop here and continue to the lesson section \"Targets\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Targets (Labels)\n",
    "\n",
    "- We are going to try to predict the go forward 1-week return\n",
    "- Very important! Quantize the target. Why do we do this?\n",
    "  - Makes it market neutral return\n",
    "  - Normalizes changing volatility and dispersion over time\n",
    "  - Make the target robust to changes in market regimes\n",
    "- The factor we create is the trailing 5-day return\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we'll create a separate pipeline to handle the target\n",
    "pipeline_target = Pipeline(screen=universe)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Example\n",
    "\n",
    "We'll convert weekly returns into 2-quantiles."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "return_5d_2q = Returns(window_length=5, mask=universe).quantiles(2)\n",
    "return_5d_2q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_target.add(return_5d_2q, 'return_5d_2q')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Quiz\n",
    "Create another weekly return target that's converted to 5-quantiles."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: create a target using 5-quantiles\n",
    "return_5d_5q = Returns(window_length=5, mask=universe).quantiles(5)\n",
    "\n",
    "# TODO: add the feature to the pipeline\n",
    "pipeline_target.add(return_5d_2q, 'return_5d_5q')\n",
    "\n",
    "# Let's run the pipeline to get the dataframe\n",
    "targets_df = engine.run_pipeline(pipeline_target, factor_start_date, universe_end_date)\n",
    "targets_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "targets_df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Solution\n",
    "\n",
    "[solution notebook](feature_engineering_solution.ipynb)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env_p7a",
   "language": "python",
   "name": "env_p7a"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
